library(data.table)
library(tidyverse)
library(dplyr)
library(plotly)
library(knitr)
opts_chunk$set(
warning = FALSE,
message = FALSE,
eval=TRUE,
echo = TRUE,
fig.width = 7,
fig.align = 'center',
fig.asp = 0.618,
out.width = "700px")
plot_ly() and ggplotly() functionsplot_geo()DataTableWe will work with the COVID data presented in lecture. Recall the dataset consists of COVID-19 cases and deaths in each US state during the course of the COVID epidemic. We will explore cases, deaths, and their population normalized values over time to identify trends.
## data extracted from New York Times state-level data from NYT Github repository
# https://github.com/nytimes/covid-19-data
## state-level population information from us_census_data available on GitHub repository:
# https://github.com/COVID19Tracking/associated-data/tree/master/us_census_data
# load COVID state-level data from NYT
### FINISH THE CODE HERE ###
cv_states <- as.data.frame(fread("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv"))
# load state population data
### FINISH THE CODE HERE ###
state_pops <- as.data.frame(fread( "https://raw.githubusercontent.com/COVID19Tracking/associated-data/master/us_census_data/us_census_2018_population_estimates_states.csv"))
#MERGE TWO DATA FRAME
state_pops$abb <- state_pops$state
state_pops$state <- state_pops$state_name
state_pops$state_name <- NULL
### FINISH THE CODE HERE ###
cv_states <- merge(cv_states,state_pops,by = "state")
head, and tail of the datadim(cv_states)
## [1] 12542 9
head(cv_states)
## state date fips cases deaths geo_id population pop_density abb
## 1 Alabama 2020-08-26 1 119254 2045 1 4887871 96.50939 AL
## 2 Alabama 2020-09-08 1 133606 2277 1 4887871 96.50939 AL
## 3 Alabama 2020-07-31 1 87723 1580 1 4887871 96.50939 AL
## 4 Alabama 2020-07-16 1 61088 1230 1 4887871 96.50939 AL
## 5 Alabama 2020-05-25 1 14986 566 1 4887871 96.50939 AL
## 6 Alabama 2020-08-25 1 117242 2037 1 4887871 96.50939 AL
tail(cv_states)
## state date fips cases deaths geo_id population pop_density abb
## 12537 Wyoming 2020-05-27 56 860 14 56 577737 5.950611 WY
## 12538 Wyoming 2020-09-10 56 4199 42 56 577737 5.950611 WY
## 12539 Wyoming 2020-07-18 56 2108 24 56 577737 5.950611 WY
## 12540 Wyoming 2020-08-12 56 3086 29 56 577737 5.950611 WY
## 12541 Wyoming 2020-09-09 56 4151 42 56 577737 5.950611 WY
## 12542 Wyoming 2020-06-10 56 980 18 56 577737 5.950611 WY
str(cv_states)
## 'data.frame': 12542 obs. of 9 variables:
## $ state : chr "Alabama" "Alabama" "Alabama" "Alabama" ...
## $ date : IDate, format: "2020-08-26" "2020-09-08" ...
## $ fips : int 1 1 1 1 1 1 1 1 1 1 ...
## $ cases : int 119254 133606 87723 61088 14986 117242 132973 14149 10164 105557 ...
## $ deaths : int 2045 2277 1580 1230 566 2037 2276 549 403 1890 ...
## $ geo_id : int 1 1 1 1 1 1 1 1 1 1 ...
## $ population : int 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 ...
## $ pop_density: num 96.5 96.5 96.5 96.5 96.5 ...
## $ abb : chr "AL" "AL" "AL" "AL" ...
state and abb into a factor variable# format the date
cv_states$date <- as.Date(cv_states$date, format="%Y-%m-%d")
# format the state variable
state_list <- unique(cv_states$state)
cv_states$state <- factor(cv_states$state, levels = state_list)
# format the state abbreviation (abb) variable
### FINISH THE CODE HERE ###
abb_list <- unique(cv_states$abb)
cv_states$abb <- factor(cv_states$abb, levels = abb_list)
# order the data first by state, second by date
cv_states = cv_states[order(cv_states$state, cv_states$date),]
# Confirm the variables are now correctly formatted
str(cv_states)
## 'data.frame': 12542 obs. of 9 variables:
## $ state : Factor w/ 52 levels "Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ date : Date, format: "2020-03-13" "2020-03-14" ...
## $ fips : int 1 1 1 1 1 1 1 1 1 1 ...
## $ cases : int 6 12 23 29 39 51 78 106 131 157 ...
## $ deaths : int 0 0 0 0 0 0 0 0 0 0 ...
## $ geo_id : int 1 1 1 1 1 1 1 1 1 1 ...
## $ population : int 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 ...
## $ pop_density: num 96.5 96.5 96.5 96.5 96.5 ...
## $ abb : Factor w/ 52 levels "AL","AK","AZ",..: 1 1 1 1 1 1 1 1 1 1 ...
head(cv_states)
## state date fips cases deaths geo_id population pop_density abb
## 183 Alabama 2020-03-13 1 6 0 1 4887871 96.50939 AL
## 190 Alabama 2020-03-14 1 12 0 1 4887871 96.50939 AL
## 230 Alabama 2020-03-15 1 23 0 1 4887871 96.50939 AL
## 133 Alabama 2020-03-16 1 29 0 1 4887871 96.50939 AL
## 66 Alabama 2020-03-17 1 39 0 1 4887871 96.50939 AL
## 39 Alabama 2020-03-18 1 51 0 1 4887871 96.50939 AL
tail(cv_states)
## state date fips cases deaths geo_id population pop_density abb
## 12497 Wyoming 2020-10-23 56 10545 68 56 577737 5.950611 WY
## 12389 Wyoming 2020-10-24 56 10805 68 56 577737 5.950611 WY
## 12404 Wyoming 2020-10-25 56 11041 68 56 577737 5.950611 WY
## 12342 Wyoming 2020-10-26 56 11477 77 56 577737 5.950611 WY
## 12461 Wyoming 2020-10-27 56 11806 77 56 577737 5.950611 WY
## 12477 Wyoming 2020-10-28 56 12146 77 56 577737 5.950611 WY
# Inspect the range values for each variable. What is the date range? The range of cases and deaths?
head(cv_states)
## state date fips cases deaths geo_id population pop_density abb
## 183 Alabama 2020-03-13 1 6 0 1 4887871 96.50939 AL
## 190 Alabama 2020-03-14 1 12 0 1 4887871 96.50939 AL
## 230 Alabama 2020-03-15 1 23 0 1 4887871 96.50939 AL
## 133 Alabama 2020-03-16 1 29 0 1 4887871 96.50939 AL
## 66 Alabama 2020-03-17 1 39 0 1 4887871 96.50939 AL
## 39 Alabama 2020-03-18 1 51 0 1 4887871 96.50939 AL
summary(cv_states)
## state date fips cases
## Washington : 282 Min. :2020-01-21 Min. : 1.00 Min. : 1
## Illinois : 279 1st Qu.:2020-05-01 1st Qu.:16.00 1st Qu.: 3323
## California : 278 Median :2020-06-30 Median :29.00 Median : 20154
## Arizona : 277 Mean :2020-06-29 Mean :29.77 Mean : 66536
## Massachusetts: 271 3rd Qu.:2020-08-29 3rd Qu.:44.00 3rd Qu.: 76971
## Wisconsin : 267 Max. :2020-10-28 Max. :72.00 Max. :931113
## (Other) :10888
## deaths geo_id population pop_density
## Min. : 0 Min. : 1.00 Min. : 577737 Min. : 1.292
## 1st Qu.: 76 1st Qu.:16.00 1st Qu.: 1805832 1st Qu.: 54.956
## Median : 532 Median :29.00 Median : 4468402 Median : 107.860
## Mean : 2291 Mean :29.77 Mean : 6560815 Mean : 420.682
## 3rd Qu.: 2250 3rd Qu.:44.00 3rd Qu.: 7535591 3rd Qu.: 229.511
## Max. :33107 Max. :72.00 Max. :39557045 Max. :11490.120
## NA's :230
## abb
## WA : 282
## IL : 279
## CA : 278
## AZ : 277
## MA : 271
## WI : 267
## (Other):10888
min(cv_states$date)
## [1] "2020-01-21"
max(cv_states$date)
## [1] "2020-10-28"
summary(cv_states$deaths)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 76 532 2291 2250 33107
summary(cv_states$cases)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 3323 20154 66536 76971 931113
new_cases and new_deaths and correct outliersAdd variables for new cases, new_cases, and new deaths, new_deaths:
new_cases is equal to the difference between cases on date i and date i-1, starting on date i=2Use plotly for EDA: See if there are outliers or values that don’t make sense for new_cases and new_deaths. Which states and which dates have strange values?
Correct outliers: Set negative values for new_cases or new_deaths to 0
Recalculate cases and deaths as cumulative sum of updates new_cases and new_deaths
# Add variables for new_cases and new_deaths:
for (i in 1:length(state_list)) {
cv_subset = subset(cv_states, state == state_list[i])
cv_subset = cv_subset[order(cv_subset$date),]
# add starting level for new cases and deaths
cv_subset$new_cases = cv_subset$cases[1]
cv_subset$new_deaths = cv_subset$deaths[1]
#### FINISH THE CODE HERE ###
for (j in 2:nrow(cv_subset)) {
cv_subset$new_cases[j] = cv_subset$cases[j] - cv_subset$cases[j-1]
cv_subset$new_deaths[j] = cv_subset$deaths[j] - cv_subset$deaths[j-1]
}
# include in main dataset
cv_states$new_cases[cv_states$state==state_list[i]] = cv_subset$new_cases
cv_states$new_deaths[cv_states$state==state_list[i]] = cv_subset$new_deaths
}
# Inspect outliers in new_cases and new_deaths using plotly
### FINISH THE CODE HERE ###
p1<-ggplot(cv_states, aes( x = date, y= new_cases, color=state)) +
geom_line() +
geom_point(size = .5, alpha = 0.5)
ggplotly(p1)
p1<-NULL # to clear from workspace
### FINISH THE CODE HERE ###
p2<-ggplot(cv_states, aes(x=date,y=new_deaths, color=state)) +
geom_line() +
geom_point(size = .5, alpha = 0.5)
ggplotly(p2)
p2<-NULL # to clear from workspace
# set negative new case or death counts to 0
cv_states$new_cases[cv_states$new_cases<0] = 0
cv_states$new_deaths[cv_states$new_deaths<0] = 0
# Recalculate `cases` and `deaths` as cumulative sum of updates `new_cases` and `new_deaths`
for (i in 1:length(state_list)) {
cv_subset = subset(cv_states, state == state_list[i])
# add starting level for new cases and deaths
cv_subset$cases = cv_subset$cases[1]
cv_subset$deaths = cv_subset$deaths[1]
for (j in 2:nrow(cv_subset)) {
cv_subset$cases[j] = cv_subset$new_cases[j] + cv_subset$cases[j-1]
cv_subset$deaths[j] = cv_subset$new_deaths[j] + cv_subset$deaths[j-1]
}
# include in main dataset
cv_states$cases[cv_states$state==state_list[i]] = cv_subset$cases
cv_states$deaths[cv_states$state==state_list[i]] = cv_subset$deaths
}
Add population-normalized (by 100,000) variables for each variable type (rounded to 1 decimal place). Make sure the variables you calculate are in the correct format (numeric). You can use the following variable names:
per100k = cases per 100,000 populationnewper100k= new cases per 100,000deathsper100k = deaths per 100,000newdeathsper100k = new deaths per 100,000Add a “naive CFR” variable representing deaths / cases on each date for each state
Create a dataframe representing values on the most recent date, cv_states_today, as done in lecture
# add population normalized (by 100,000) counts for each variable
cv_states$per100k = as.numeric(format(round(cv_states$cases/(cv_states$population/100000),1),nsmall=1))
cv_states$newper100k = as.numeric(format(round(cv_states$new_cases/(cv_states$population/100000),1),nsmall=1))
cv_states$deathsper100k = as.numeric(format(round(cv_states$deaths/(cv_states$population/100000),1),nsmall=1))
cv_states$newdeathsper100k = as.numeric(format(round(cv_states$new_deaths/(cv_states$population/100000),1),nsmall=1))
# add a naive_CFR variable = deaths / cases
cv_states = cv_states %>% mutate(naive_CFR = round((deaths*100/cases),2))
# create a `cv_states_today` variable
### FINISH THE CODE HERE ###
max_date <- max(cv_states$date)
cv_states_today = cv_states %>% filter (date==as.Date(max_date))
plot_ly()plot_ly() representing pop_density vs. various variables (e.g. cases, per100k, deaths, deathsper100k) for each state on most recent date (cv_states_today)
hovermode = "compare"# pop_density vs. cases
### FINISH THE CODE HERE ###
cv_states_today %>%
plot_ly(x = ~pop_density, y = ~cases,
type = 'scatter', mode = 'markers', color = ~state,
size = ~population, sizes = c(5, 70), marker = list(sizemode='diameter', opacity=0.5))
# filter out "District of Columbia"
cv_states_today_scatter <- cv_states_today %>% filter(state!="District of Columbia")
# pop_density vs. cases after filtering
cv_states_today_scatter %>%
plot_ly(x = ~pop_density, y = ~cases,
type = 'scatter', mode = 'markers', color = ~state,
size = ~population, sizes = c(5, 70), marker = list(sizemode='diameter', opacity=0.5))
# pop_density vs. deathsper 100k
cv_states_today_scatter %>%
plot_ly(x = ~pop_density, y = ~newdeathsper100k,
type = 'scatter', mode = 'markers', color = ~state,
size = ~population, sizes = c(5, 70), marker = list(sizemode='diameter', opacity=0.5))
# Adding hoverinfo
cv_states_today_scatter %>%
plot_ly(x = ~pop_density, y = ~deathsper100k,
type = 'scatter', mode = 'markers', color = ~state,
size = ~population, sizes = c(5, 70),
marker = list(sizemode='diameter', opacity=0.5),
hoverinfo = 'text',
text = ~paste( paste(state, ":", sep=""),
paste(" Cases per 100k: ", per100k, sep="") ,
paste(" Deaths per 100k: ", deathsper100k, sep=""), sep = "<br>")) %>%
layout(title = "Population-normalized COVID-19 deaths (per 100k) vs. population density for US states",yaxis = list(title = "Deaths per 100k"),
xaxis = list(title = "Population Density"),
hovermode = "compare")
ggplotly() and geom_smooth()pop_density vs. newdeathsper100k create a chart with the same variables using gglot_ly()
geom_*() we need here?geom_smooth()
pop_density is a correlate of newdeathsper100k?### FINISH THE CODE HERE ###
p <- ggplot(cv_states_today_scatter, aes(x=pop_density, y=newdeathsper100k, size=population)) + geom_point()+ geom_smooth()
ggplotly(p)
naive_CFR for all states over time using plot_ly()
naive_CFR for the states that had a “first peak” in September. How have they changed over time?new_cases and new_deaths together in one plot. Hint: use add_lines()
# Line chart for naive_CFR for all states over time using `plot_ly()`
plot_ly(cv_states, x = ~date, y = ~naive_CFR, color = ~state, type = "scatter", mode = "lines")
# Line chart for Texas showing new_cases and new_deaths together
### FINISH THE CODE HERE ###
cv_states %>% filter(state=="Texas") %>% plot_ly(x = ~date, y = ~new_cases, type = "scatter", mode = "lines") %>% add_lines(x = ~date, y = ~new_deaths, type = "scatter", mode = "lines")
-Some states have earlier “first peak” of CFR at March,such as South Dakota,Washington,Florida. Most of the states have a later peak time at May. -From the line for Texas, the peak of new cases is around second week of July and the peak of is around the end of July.
Create a heatmap to visualize new_cases for each state on each date greater than April 1st, 2020 - Start by mapping selected features in the dataframe into a matrix using the tidyr package function pivot_wider(), naming the rows and columns, as done in the lecture notes - Use plot_ly() to create a heatmap out of this matrix - Create a second heatmap in which the pattern of new_cases for each state over time becomes more clear by filtering to only look at dates every two weeks
# Map state, date, and new_cases to a matrix
library(tidyr)
cv_states_mat <- cv_states %>% select(state, date, new_cases)%>% filter(date>as.Date("2020-04-01"))
cv_states_mat2 <- as.data.frame(pivot_wider(cv_states_mat, names_from = state, values_from = new_cases))
rownames(cv_states_mat2) <- cv_states_mat2$date
cv_states_mat2$date <- NULL
cv_states_mat2 <- as.matrix(cv_states_mat2)
# Create a heatmap using plot_ly()
plot_ly(x=colnames(cv_states_mat2), y=rownames(cv_states_mat2),
z=~cv_states_mat2,
type="heatmap",
showscale=T)
# Create a second heatmap after filtering to only include dates every other week
filter_dates <- seq(as.Date("2020-04-01"), as.Date("2020-10-01"), by="2 weeks")
### FINISH THE CODE HERE ###
cv_states_mat <- cv_states %>% select(state, date, new_cases) %>% filter(date %in% filter_dates )
cv_states_mat2 <- as.data.frame(pivot_wider(cv_states_mat, names_from = state, values_from = new_cases))
rownames(cv_states_mat2) <- cv_states_mat2$date
cv_states_mat2$date <- NULL
cv_states_mat2 <- as.matrix(cv_states_mat2)
# Create a heatmap using plot_ly()
plot_ly(x=colnames(cv_states_mat2), y=rownames(cv_states_mat2),
z=~cv_states_mat2,
type="heatmap",
showscale=T)
naive_CFR by state on May 1st, 2020naive_CFR by state on most recent datesubplot(). Make sure the shading is for the same range of values (google is your friend for this)### For May 1 2020
# Extract the data for each state by its abbreviation
cv_CFR <- cv_states %>% filter(date=="2020-05-01") %>% select(state, abb, naive_CFR, cases, deaths) # select data
cv_CFR$state_name <- cv_CFR$state
cv_CFR$state <- cv_CFR$abb
cv_CFR$abb <- NULL
# Create hover text
cv_CFR$hover <- with(cv_CFR, paste(state_name, '<br>', "CFR: ", naive_CFR, '<br>', "Cases: ", cases, '<br>', "Deaths: ", deaths))
# Set up mapping details
set_map_details <- list(
scope = 'usa',
projection = list(type = 'albers usa'),
showlakes = TRUE,
lakecolor = toRGB('white')
)
# Make sure both maps are on the same color scale
shadeLimit <- 9
# Create the map
fig <- plot_geo(cv_CFR, locationmode = 'USA-states') %>%
add_trace(
z = ~naive_CFR, text = ~hover, locations = ~state,
color = ~naive_CFR, colors = 'Purples'
)
fig <- fig %>% colorbar(title = "CFR", limits = c(0,shadeLimit))
fig <- fig %>% layout(
title = paste('CFR by State as of', Sys.Date(), '<br>(Hover for value)'),
geo = set_map_details
)
fig_May1 <- fig
#############
### For Today
# Extract the data for each state by its abbreviation
cv_CFR <- cv_states_today %>%
select(state, abb, naive_CFR, cases, deaths) # select data
cv_CFR$state_name <- cv_CFR$state
cv_CFR$state <- cv_CFR$abb
cv_CFR$abb <- NULL
# Create hover text
cv_CFR$hover <- with(cv_CFR, paste(state_name, '<br>', "CFR: ", naive_CFR, '<br>', "Cases: ", cases, '<br>', "Deaths: ", deaths))
# Set up mapping details
set_map_details <- list(
scope = 'usa',
projection = list(type = 'albers usa'),
showlakes = TRUE,
lakecolor = toRGB('white')
)
# Create the map
fig <- plot_geo(cv_CFR, locationmode = 'USA-states') %>%
add_trace(
z = ~naive_CFR, text = ~hover, locations = ~state,
color = ~naive_CFR, colors = 'Purples'
)
fig <- fig %>% colorbar(title = "CFR Today", limits = c(0,shadeLimit))
fig <- fig %>%
layout(
title = paste('CFR by State as of', Sys.Date(), '<br>(Hover for value)'),
geo = set_map_details
)
fig_Today <- fig
### Plot side by side
### FINISH THE CODE HERE ###
subplot(fig_May1,fig_Today)